Load data
load("01.1_qc_checks/phyloseq_otu16m_mat_filt_20_thr10.rds")
load("01.1_qc_checks/phyloseq_otu16n_mat_filt_20_thr10.rds")
load("01.1_qc_checks/phyloseq_otu18m_nohost_mat_filt_20_thr10.rds")
load("01.1_qc_checks/phyloseq_otu18n_nohost_mat_filt_20_thr10.rds")
# format data
phyloseq16m <- prune_samples(sample_sums(phyloseq_otu16m_mat_filt_20_thr10) >0, phyloseq_otu16m_mat_filt_20_thr10)
phyloseq16n <- prune_samples(sample_sums(phyloseq_otu16n_mat_filt_20_thr10) >0, phyloseq_otu16n_mat_filt_20_thr10)
phyloseq18m <- prune_samples(sample_sums(phyloseq_otu18m_nohost_mat_filt_20_thr10) >0, phyloseq_otu18m_nohost_mat_filt_20_thr10)
phyloseq18n <- prune_samples(sample_sums(phyloseq_otu18n_nohost_mat_filt_20_thr10) >0, phyloseq_otu18n_nohost_mat_filt_20_thr10)
rm(phyloseq_otu16m_mat_filt_20_thr10, phyloseq_otu16n_mat_filt_20_thr10,
phyloseq_otu18m_nohost_mat_filt_20_thr10, phyloseq_otu18n_nohost_mat_filt_20_thr10)
phyloseq_list <- list(
phyloseq16m = phyloseq16m,
phyloseq16n = phyloseq16n,
phyloseq18m = phyloseq18m,
phyloseq18n = phyloseq18n)
# rename wild cap groups
for (i in seq_along(phyloseq_list)) {
phy <- phyloseq_list[[i]]
sd <- as.data.frame(sample_data(phy))
sd$captive_wild <- factor(sd$Captive.Wild,
levels = c("Captive", "Wild, free ranging", "Wild, seized from traffickers"),
labels = c("Captive", "Wild", "Seized"))
sd$captive_wild_pro <- ifelse(sd$captive_wild == "Seized" & sd$Given.probiotic == "Yes", "Seized Probiotics",
ifelse(sd$captive_wild == "Seized" & sd$Given.probiotic != "Yes", "Seized No Probiotics",
as.character(sd$captive_wild)))
sample_data(phy) <- sample_data(sd)
phyloseq_list[[i]] <- phy
}
rm(phyloseq16m, phyloseq16n, phyloseq18m, phyloseq18n, phy)
Set plot themes and colors for throughout
colors <- c("#493D9E","#5D8233","#ECD662")
colorlist <- c(
"Captive" = "#493D9E",
"Wild" = "#5D8233",
"Seized" = "#ECD662")
all_theme <- c(axis.text.y = element_text(size = 12),
axis.text.x = element_text(size = 12),
panel.grid.major.y = element_line(color = "grey", linewidth=0.2),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
strip.background = element_rect(colour = NA, fill = "grey"),
plot.margin = ggplot2::margin(1, 1, 1, 1, unit = "cm"))
Alpha D and Read Depth
Alpha Diversity
# Define the pairs to compare
my_comparisons <- list(
c("Captive", "Wild"),
c("Captive", "Seized Probiotics"),
c("Wild", "Seized Probiotics"),
c("Captive", "Seized No Probiotics"),
c("Wild", "Seized No Probiotics"),
c("Seized Probiotics", "Seized No Probiotics"))
# Placeholder to store plots
alpha_plots <- lapply(names(phyloseq_list), function(name) {
phy <- phyloseq_list[[name]]
GP <- prune_taxa(taxa_sums(phy) > 0, phy) # Prune zero abundance taxa
alpha <- plot_richness(GP, x = "captive_wild_pro",
color = "captive_wild_pro",
measures = c("Chao1", "Shannon", "Simpson")) +
geom_boxplot(size = 1, alpha = 0.5) +
scale_color_manual(values = c("#493D9E", "#EAD672", "#EFC650", "#5D8233")) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
plot.caption = element_text(size = 9, hjust = 1)) +
stat_compare_means(
comparisons = my_comparisons,
method = "t.test",
label = "p.signif",
size = 3) +
labs(title = paste("Alpha Diversity for: ", name),
caption = "Significance (t-test): ns: p > 0.05 *: p ≤ 0.05 **: p ≤ 0.01 ***: p ≤ 0.001 ****: p ≤ 0.0001")
return(alpha)
})
## Warning in estimate_richness(physeq, split = TRUE, measures = measures): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
##
## We recommended that you find the un-trimmed data and retry.
## Warning in estimate_richness(physeq, split = TRUE, measures = measures): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
##
## We recommended that you find the un-trimmed data and retry.
## Warning in estimate_richness(physeq, split = TRUE, measures = measures): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
##
## We recommended that you find the un-trimmed data and retry.
## Warning in estimate_richness(physeq, split = TRUE, measures = measures): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
##
## We recommended that you find the un-trimmed data and retry.
# Optional: print all plots
#for (p in alpha_plots) {
# print(p)
#}
wrap_plots(alpha_plots, nrow = 2, ncol = 2)

ANCOMBC
# Loaded ancombc results from the 05_EY_ANCOMBC.R script
load("05_taxa_driving_groups/ancom_16m_16n_18m_18m_trimmed.RData")
# Loop through datasets and taxonomic levels
ancombc_plots <- list()
for (dataset_name in names(anc_results_trimmed)) {
for (tax_level in names(anc_results_trimmed[[dataset_name]])) {
#cat("Fetching ancom results for:", dataset_name, "at", tax_level, "\n")
res_obj <- anc_results_trimmed[[dataset_name]][[tax_level]]
# Check if result object contains `res` element
if (!"res" %in% names(res_obj)) {
warning(paste("No 'res' in", dataset_name, tax_level))
next
}
res_df <- res_obj$res
res_df <- res_df %>%
dplyr::filter(`q_(Intercept)` < 0.05)
#cat("res_df size:", format(object.size(res_df), units = "MB"), "\n")
# Check for valid results
if (nrow(res_df) == 0) next
# Reshape the lfc, q, and diff values separately
lfc_df <- res_df %>%
dplyr::select(taxon, starts_with("lfc_")) %>%
pivot_longer(cols = -taxon, names_to = "group", values_to = "lfc") %>%
dplyr::mutate(group = sub("lfc_", "", group))
q_df <- res_df %>%
dplyr::select(taxon, starts_with("q_")) %>%
pivot_longer(cols = -taxon, names_to = "group", values_to = "q") %>%
dplyr::mutate(group = sub("q_", "", group))
diff_df <- res_df %>%
dplyr::select(taxon, starts_with("diff_")) %>%
pivot_longer(cols = -taxon, names_to = "group", values_to = "diff") %>%
dplyr::mutate(group = sub("diff_", "", group))
# Join all into a single long-form dataframe
plot_df <- reduce(list(lfc_df, q_df, diff_df),
left_join, by = c("taxon", "group"))
# Filter significant results
plot_df <- plot_df %>% filter(diff == TRUE, q < 0.05)
if (nrow(plot_df) == 0) next # Skip if no significant taxa
# Clean and format for plotting
plot_df$group <- str_replace_all(plot_df$group, "captive_wild", "")
plot_df$taxon <- gsub("_", " ", plot_df$taxon)
plot_df <- plot_df %>%
dplyr::mutate(group = ifelse(group == "(Intercept)", "Captive", group))
plot_df$group <- sub("^Captive\\.", "", plot_df$group)
plot_df$group <- sub("WildWild", "Wild", plot_df$group)
#cat("plot_df size:", format(object.size(plot_df), units = "MB"), "\n")
# Create plot
p <- ggplot(plot_df, aes(x = reorder(taxon, lfc), y = lfc, fill = group)) +
geom_col(position = position_dodge(width = 0.9)) +
coord_flip() +
labs(title = paste("Significant DA (ANCOMBC2):", dataset_name, "-", tax_level),
x = "Taxon", y = "Log Fold Change", fill = "Group") +
scale_fill_manual(
values = c("Captive" = "#493D9E",
"Wild" = "#EAD672",
"Seized" = "#5D8233")) +
theme_bw(base_size = 12) +
theme(plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10),
axis.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.text = element_text(size = 10))
ancombc_plots[[dataset_name]][[tax_level]] <- p
# Define output file name
output_dir <- "05_taxa_driving_groups"
plot_file <- file.path(output_dir,
paste0(dataset_name, "_ancombc_all_", tolower(tax_level), ".png"))
ggsave(filename = plot_file, plot = p, height = 15, width = 18)
}
}
library(patchwork)
p1 <- ancombc_plots$phyloseq16m$family
p2 <- ancombc_plots$phyloseq16m$genus + theme(legend.position = "none")
p3 <- ancombc_plots$phyloseq16m$species + theme(legend.position = "none")
p4 <- ancombc_plots$phyloseq16n$family
p5 <- ancombc_plots$phyloseq16n$genus + theme(legend.position = "none")
p6 <- ancombc_plots$phyloseq16n$species + theme(legend.position = "none")
p7 <- ancombc_plots$phyloseq18m$family
p8 <- ancombc_plots$phyloseq18m$genus + theme(legend.position = "none")
p9 <- ancombc_plots$phyloseq18m$species + theme(legend.position = "none")
p10 <- ancombc_plots$phyloseq18n$family
p11 <- ancombc_plots$phyloseq18n$genus + theme(legend.position = "none")
p12 <- ancombc_plots$phyloseq18n$species + theme(legend.position = "none")
# Group plots into rows
row1 <- p1 | p2 | p3
row2 <- p4 | p5 | p6
row3 <- p7 | p8 | p9
row4 <- p10 | p11 | p12
rm(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12)
# Combine rows and control relative height
(row1 / row2 / row3 / row4) +
plot_layout(heights = c(4.5, 4.5, 2.5, 1)) &
theme(plot.margin = margin(t = 10, r = 10, b = 10, l = 10))

ANCOMBC VENN DIAGRAMS
# Define your datasets
datasets <- c("phyloseq16m", "phyloseq16n", "phyloseq18m", "phyloseq18n")
levels <- c("family", "genus", "species")
venn_plots_anc_levels <- list()
for (level in levels) {
venn_plots_anc_levels[[level]] <- process_tax_level(level, datasets)
}
grobs <- lapply(venn_plots_anc_levels, function(x) {
grid.grabExpr(grid.draw(x[[1]]))
})
grobs_margined <- lapply(grobs, function(g) {
grobTree(g, vp = viewport(width = 0.7, height = 0.4))
})
grid.arrange(grobs = grobs_margined, ncol = 3,
top = textGrob("ANCOMBC Feature Overlaps", gp = gpar(fontsize = 16, fontface = "bold")))

LDA
# load lda results from the 05_ET_LDA.R script
load("05_taxa_driving_groups/LDA/lda4_16m_16n_18m_18n.RData")
# Initialize a list to store the plots (optional)
diffbox_plots <- list()
for (taxrank in names(lda_diff_results_2)) { # (Species, Genus, Family)
for (dataset in names(lda_diff_results_2[[taxrank]])) {
deres <- lda_diff_results_2[[taxrank]][[dataset]]
# Select top 30 features by LDAmean
top30_feat <- deres@result %>%
arrange(desc(LDAmean)) %>%
slice_head(n = 30) %>%
pull(f)
# Create a new object containing only the top 30 features
deres_top30 <- deres
deres_top30@result <- deres@result %>%
filter(f %in% top30_feat)
#deres_top30@result$f <- gsub("__", ": ", deres_top30@result$f)
deres_top30@result$label <- gsub("__", ": ", deres_top30@result$f)
diffbox <- ggdiffbox(
obj = deres_top30,
box_notch = FALSE,
lineheight = 0.1,
linewidth = 0.3,
colorlist = colorlist,
l_xlabtext = "Relative Abundance"
)
# Optional: relabel facets or axes after plot is created
diffbox <- diffbox + scale_y_discrete(labels = deres_top30@result$label)
# Save or store the plot
plot_name <- paste0("diffbox_", taxrank, "_", dataset)
diffbox_plots[[taxrank]][[dataset]] <- diffbox
# Optionally: Print or save plot
#print(diffbox)
ggsave(paste0("05_taxa_driving_groups/LDA/", plot_name, ".png"),
diffbox, width = 8, height = 6, dpi = 300)
}
}
all_lda_plots <- unlist(diffbox_plots, recursive = FALSE)
species <- all_lda_plots[1:4]
genus <- all_lda_plots[5:8]
family <- all_lda_plots[9:12]
Family level LDA
wrap_plots(family, nrow = 2, ncol = 2)

Genus level LDA
wrap_plots(genus, nrow = 2, ncol = 2)

Species level LDA
wrap_plots(species, nrow = 2, ncol = 2)

Combined ANCOMBC LDA plots
#### Taxa overlap
strip_prefix <- function(x) sub("^[a-z]__+", "", x)
lda_diff_results_2[["Genus"]]$phyloseq16n@result$f <- strip_prefix(lda_diff_results_2[["Genus"]]$phyloseq16n@result$f)
lda_genus_16n_feature_list <- unique(lda_diff_results_2[["Genus"]]$phyloseq16n@result$f) %>% as.character()
anc_genus_16n_feature_list <- unique(ancombc_plots[["phyloseq16n"]]$genus$data$taxon) %>% as.character()
# Find overlap
overlap <- intersect(lda_genus_16n_feature_list, anc_genus_16n_feature_list)
print(overlap)
## [1] "Bifidobacteriaceae" "Clostridium sensu stricto 1"
## [3] "Helicobacter" "Ligilactobacillus"
## [5] "Bacilli" "Clostridia"
unique_to_ancombc <- setdiff(anc_genus_16n_feature_list, overlap)
unique_to_lda <- setdiff(lda_genus_16n_feature_list, overlap)
ordered_features_ancombc <- c(overlap, unique_to_ancombc)
ordered_features_all <- c(overlap, unique_to_ancombc, unique_to_lda)
# Plot p1
lda_genus_16n <- lda_diff_results_2[["Genus"]]$phyloseq16n
# Replace only column names
colnames(lda_genus_16n@originalD) <- strip_prefix(colnames(lda_genus_16n@originalD))
p1 <- make_p1_boxplot(
obj = lda_genus_16n,
featurelist = ordered_features_all,
colorlist = colorlist,
box_notch = FALSE,
xlabtext = "Relative Abundance")
## Aesthetic mapping:
## * `colour` -> `captive_wild`
## * `x` -> `feature`
## * `y` -> `value`
p1

p1_ord <- make_p1_boxplot_ordered(
obj = lda_genus_16n,
featurelist = ordered_features_all,
colorlist = colorlist,
box_notch = FALSE,
xlabtext = "Relative Abundance")
p1_ord

# Plot effect size (p2)
# Add missing rows to nodedfres_all (for p2):
# Fill missing features for effect size plot with NA
nodedfres_all <- lda_genus_16n@result
missing_in_p2 <- setdiff(ordered_features_all, nodedfres_all$f)
if (length(missing_in_p2) > 0) {
na_rows <- data.frame(f = missing_in_p2)
# Fill other columns with NA or suitable values
na_rows[, setdiff(colnames(nodedfres_all), "f")] <- NA
nodedfres_all <- rbind(nodedfres_all, na_rows)
}
# Reorder levels to match p1
nodedfres_all$f <- factor(nodedfres_all$f, levels = rev(ordered_features_all))
p2 <- make_p2_effectsize(obj = lda_genus_16n,
nodedfres = nodedfres_all,
colorlist = colorlist,
xlim_range = c(2.5, 5.5))
## The color has been set automatically, you can reset it manually by adding scale_color_manual(values=yourcolors)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p2 <- p2 + theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank())
p2
## Warning: Removed 27 rows containing missing values or values outside the scale range
## (`geom_errorbarh()`).
## Warning: Removed 27 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Plot p3 ancombc
# Add missing rows to anc_plot_dfs[["phyloseq16n"]]$genus (for p3):
anc_df <- ancombc_plots[["phyloseq16n"]]$genus[["data"]]
missing_in_p3 <- setdiff(ordered_features_all, anc_df$taxon)
if (length(missing_in_p3) > 0) {
na_rows <- data.frame(taxon = missing_in_p3)
na_rows[, setdiff(colnames(anc_df), "taxon")] <- NA
anc_df <- rbind(anc_df, na_rows)
}
anc_df$taxon <- factor(anc_df$taxon, levels = rev(ordered_features_all))
p3 <- make_p3_ancombc(ancombc_df = anc_df,
dataset_name = "16N",
tax_level = "Genus",
featurelist = ordered_features_all,
colorlist = colorlist)
p3 <- p3 + theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_blank())
p3
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_col()`).

p1 | p2 | p3
## Warning: Removed 27 rows containing missing values or values outside the scale range
## (`geom_errorbarh()`).
## Removed 27 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_col()`).
